Version: 1.2 (Jupytext, time measurements, logger, param notebook execution, fixes)
Please put your comments about the notebook functionality here.
import os
import sys

# Make packages located one and two directory levels above the current
# working directory importable (e.g. the project-level `src` package).
sys.path.extend(
    [
        os.path.join(os.getcwd(), ".."),
        os.path.join(os.getcwd(), "../.."),
    ]
)
ToC Necessary libraries for notebook functionality:
NOTE: When created this way (via the function), the button works only in an active notebook. If the functionality needs to be preserved in an HTML export, the code has to be included directly in the notebook.
# Name of the logger configuration passed to `envs.set_logger` below.
LOGGER_CONFIG_NAME = "logger_file_limit_console"
# When True, a CSS override stretches the notebook container to full width.
ADDAPT_WIDTH = False

# The notebook support helpers are optional: if they cannot be imported or the
# notebook name cannot be determined, fall back to a placeholder name.
try:
    from src.utils.notebook_support_functions import create_button, get_notebook_name

    NOTEBOOK_NAME = get_notebook_name()
    SUPPORT_FUNCTIONS_READ = True
except Exception:
    # Deliberate best-effort fallback. `Exception` (instead of a bare
    # `except:`) keeps KeyboardInterrupt / SystemExit from being swallowed.
    NOTEBOOK_NAME = "NO_NAME"
    SUPPORT_FUNCTIONS_READ = False

from src.utils.logger import Logger
from src.utils.envs import Envs
from src.utils.config import Config
from pandas import options
from IPython.display import display, HTML

# Show up to 500 rows / columns when pandas objects are rendered.
options.display.max_rows = 500
options.display.max_columns = 500

# Configure the logger and start the notebook-level timer; the matching
# `Logger().end_timer()` call is at the end of the notebook.
envs = Envs()
envs.set_logger(LOGGER_CONFIG_NAME)
Logger().start_timer(f"NOTEBOOK; Notebook name: {NOTEBOOK_NAME}")

if ADDAPT_WIDTH:
    display(HTML("<style>.container { width:100% !important; }</style>"))  # notebook width
2024-02-08 10:04:46,699 - file_limit_console - INFO - Logger was created on WS-3000 in branche 019_unify_pipelines_inputs. 2024-02-08 10:04:46,701 - file_limit_console - INFO - Process: NOTEBOOK; Notebook name: data_frame_explorer_documentation.py; Timer started;
# create_button()
from datetime import datetime
from importlib import reload
from pandas import DataFrame
from numpy.random import choice, randn, seed
from src.utils.date_time_functions import create_datetime_id
import src.data.df_explorer as DFE
# from src.global_constants import * # Remember to import only the constants in use
# Notebook-level constants.
# NOTE(review): N_ROWS_TO_DISPLAY, FIGURE_SIZE_SETTING and DATA_PROCESSING_CONFIG_NAME
# are not referenced in the visible part of this notebook - confirm they are still needed.
N_ROWS_TO_DISPLAY = 2
FIGURE_SIZE_SETTING = {"autosize": False, "width": 2200, "height": 750}
DATA_PROCESSING_CONFIG_NAME = "data_processing_basic"
# MANDATORY FOR CONFIG DEFINITION AND NOTEBOOK AND ITS OUTPUTS IDENTIFICATION #########################################
PYTHON_CONFIG_NAME = "python_local"
# Identifier built from the current local time; add_micro=False presumably
# leaves microseconds out of the id - confirm in create_datetime_id.
ID = create_datetime_id(now=datetime.now(), add_micro=False)
# (END) MANDATORY FOR CONFIG DEFINITION AND NOTEBOOK AND ITS OUTPUTS IDENTIFICATION ###################################
# Apply the configuration named by PYTHON_CONFIG_NAME to the `envs` object created in the setup cell.
envs.set_config(PYTHON_CONFIG_NAME)
# Reproducible 100-row demo frame: a categorical "sex" column and a
# standard-normal "number" column. The seed is fixed and the draws happen in
# the order choice -> randn, so the generated values are stable across runs.
n = 100
seed(876)
df = DataFrame(
    {
        "sex": choice(["male", "female"], n),
        "number": randn(n),
    }
)
df.head()
| sex | number | |
|---|---|---|
| 0 | male | -0.087944 |
| 1 | female | -1.078677 |
| 2 | female | -1.269129 |
| 3 | female | -0.616576 |
| 4 | female | -0.390710 |
# Reload the explorer module so the current version of src.data.df_explorer is used.
reload(DFE)
df_explorer = DFE.DFExplorer()
# Prints the frame's type, shape, dtypes, head and a numeric description (see the output below).
df_explorer.print_info_about_data_frame(df=df)
DataFrame type: <class 'pandas.core.frame.DataFrame'>
DataFrame shape: (100, 2)
DataFrame dtypes: {'sex': 'object', 'number': 'float64'}
DataFrame head:
sex number
0 male -0.087944
1 female -1.078677
2 female -1.269129
3 female -0.616576
4 female -0.390710
DataFrame description:
number
count 100.000000
mean -0.109604
std 1.137798
min -3.088058
25% -0.780472
50% -0.143400
75% 0.600610
max 3.264726
# Per the recorded output, yields a mapping of column name -> dtype name.
df_explorer.get_df_types(df=df)
{'sex': 'object', 'number': 'float64'}
# Reports the memory usage of the "number" attribute for each dtype listed in list_dtypes.
df_explorer.get_memory_usage(df=df, attr_name="number", list_dtypes=["float64", "float32", "float16"])
Memory usage for attribute: number Attribute Name: number Measured dtype: float64 Memory Usage: 928 Attribute Name: number Measured dtype: float32 Memory Usage: 528 Attribute Name: number Measured dtype: float16 Memory Usage: 328
# NaN statistics per attribute; with fraction=True the report is a table that also has a "Fraction" column.
df_explorer.get_nan_stats(df=df, fraction=True)
DataFrame shape: (100, 2) Total number of NaN values: 0 NaN values per Attribute: Attribute NaN values Fraction ----------- ------------ ---------- sex 0 0 number 0 0 TOTAL 0 0
# NaN statistics per attribute; with fraction=False only the plain NaN counts are reported.
df_explorer.get_nan_stats(df=df, fraction=False)
DataFrame shape: (100, 2) Total number of NaN values: 0 NaN values per Attribute: sex 0 number 0 dtype: int64
# Prints, for every attribute, its type, null count and unique-value statistics (see the output below).
df_explorer.print_attr_stats(df=df)
Attribute Name: sex Attribute type: object Number of Null values: 0 Number of unique values is:2 Percentage of unique values is: 0.02 Summation of unique values per ID: male 55 female 45 Name: sex, dtype: int64
############################################# Attribute Name: number Attribute type: float64 Number of Null values: 0 Number of unique values is:100 Percentage of unique values is: 1.0
#############################################
# Two 3x3 fixtures for the comparison demo. They differ in exactly one cell:
# row 1 of NUMBER_2 is 2.0 in data_1 and 10.0 in data_2.
attr_names = ["NUMBER_1", "NUMBER_2", "NUMBER_3"]

data_1 = [[1.0, 2.0, 3.0], [3.0, 2.0, 1.0], [4.0, 5.0, 2.0]]
data_2 = [[1.0, 2.0, 3.0], [3.0, 10.0, 1.0], [4.0, 5.0, 2.0]]

df_1 = DataFrame(data=data_1, columns=attr_names)
df_2 = DataFrame(data=data_2, columns=attr_names)
# identical data frames: df_1 is compared with itself, so every reported difference is 0.0
df_explorer.compare_attributes_in_data_frames(df_1, df_1, attr_names)
Are DFs equal in pandas? True Checking Overall Sums for DFs - Sum of First List is: 23.0 - Sum of Second List is: 23.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_1 - Sum of First List is: 8.0 - Sum of Second List is: 8.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_2 - Sum of First List is: 9.0 - Sum of Second List is: 9.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_3 - Sum of First List is: 6.0 - Sum of Second List is: 6.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0
# not identical data frames: df_1 and df_2 differ in one NUMBER_2 value, so only the
# overall and NUMBER_2 sums show a non-zero difference
df_explorer.compare_attributes_in_data_frames(df_1, df_2, attr_names)
Are DFs equal in pandas? False Checking Overall Sums for DFs - Sum of First List is: 23.0 - Sum of Second List is: 31.0 - Subtraction of Sums of Lists is: -8.0 - Percentage of Difference (1-2)/1 is: -0.34782608695652173 - Percentage of Difference (1-2)/2 is: -0.25806451612903225 Checking for Attribute: NUMBER_1 - Sum of First List is: 8.0 - Sum of Second List is: 8.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_2 - Sum of First List is: 9.0 - Sum of Second List is: 17.0 - Subtraction of Sums of Lists is: -8.0 - Percentage of Difference (1-2)/1 is: -0.8888888888888888 - Percentage of Difference (1-2)/2 is: -0.47058823529411764 Checking for Attribute: NUMBER_3 - Sum of First List is: 6.0 - Sum of Second List is: 6.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0
# Stop the notebook timer started in the setup cell; the elapsed duration is logged.
Logger().end_timer()
2024-02-08 10:04:47,354 - file_limit_console - INFO - Process: NOTEBOOK; Notebook name: data_frame_explorer_documentation.py; Timer ended; Process Duration [s]: 0.65; Process Duration [m]: 0.01